MSGSU
ISTATISTIK BOLUMU - R ILE ISTATISTIKSEL PROGRAMLAMA DERS NOTLARI
by ozge.ozdamar@msgsu.edu.tr is licensed under a
Creative
Commons Attribution-NonCommercial-ShareAlike 4.0 International
License.
Hata ve öneriler için ozge.ozdamar@msgsu.edu.tr
REFERENCES
LIBRARIES
# Start from an empty global workspace (attached packages are not affected).
rm(list = ls())
# Packages used in these notes; each name is listed exactly once.
.packages <- c("car", "doBy", "lubridate", "VIM", "mice", "Amelia", "naniar",
               "plyr", "sqldf", "dplyr")
# installed.packages() returns a matrix, so test membership against its row
# names (the package names) rather than against every cell of the matrix.
.inst <- .packages %in% rownames(installed.packages())
if (any(!.inst)) install.packages(.packages[!.inst])
# library() stops with an error when a package cannot be attached, whereas
# require() only returns FALSE; invisible() hides the list returned by lapply().
invisible(lapply(.packages, library, character.only = TRUE))
FUNCTIONS
car::recode()
doBy::recodeVar()
base::within(), cut(),
plyr::revalue(), mapvalues()
data
data(mtcars)
data(airquality)
# Small example data frame with two numeric columns.
veri <- data.frame(
  x1 = c(2, 2, 6, 4),
  x2 = c(3, 4, 2, 8)
)
veri
# Add the row-wise sum and mean of x1 and x2 as new columns.
veri[["sumx"]] <- veri[["x1"]] + veri[["x2"]]
veri[["meanx"]] <- (veri[["x1"]] + veri[["x2"]]) / 2
veri
# Same computation after attach(): x1 and x2 are found on the search path and
# can be read by bare name. The results are still assigned through veri$...,
# because attach() only makes the columns readable, not writable.
attach(veri)
veri$sumx <- x1 + x2
veri$meanx <- (x1 + x2)/2
# Remove veri from the search path again.
detach(veri)
# transform() evaluates the expressions with veri's columns in scope and
# returns the data frame with the sumx and meanx columns set.
veri <- base::transform(veri, sumx = x1 + x2, meanx = (x1 + x2)/2)
veri
Recoding involves creating new values of a variable conditional on the existing values of the same and/or other variables.
head(mtcars)
rank(mtcars$mpg)
## [1] 19.5 19.5 24.5 21.5 15.0 14.0 4.0 26.0 24.5 16.5 13.0 11.0 12.0 7.5 1.5
## [16] 1.5 5.0 31.0 29.5 32.0 23.0 9.0 7.5 3.0 16.5 28.0 27.0 29.5 10.0 18.0
## [31] 6.0 21.5
# Recode mpg into a character category with one logical-index assignment per
# band: "A" for mpg below 10, "B" for 10 to 20 inclusive, "C" for above 20.
mtcars$mpgcat1[mtcars$mpg < 10] <-"A"
mtcars$mpgcat1[mtcars$mpg >= 10 & mtcars$mpg <= 20] <-"B"
mtcars$mpgcat1[mtcars$mpg > 20] <-"C"
# Print the data, then inspect the column types.
mtcars
str(mtcars)
## 'data.frame': 32 obs. of 12 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp : num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat : num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec : num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear : num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb : num 4 4 1 1 2 1 4 2 2 4 ...
## $ mpgcat1: chr "C" "C" "C" "C" ...
mtcars$mpgcat1 <- as.factor(mtcars$mpgcat1)
str(mtcars)
## 'data.frame': 32 obs. of 12 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp : num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat : num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec : num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear : num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb : num 4 4 1 1 2 1 4 2 2 4 ...
## $ mpgcat1: Factor w/ 2 levels "B","C": 2 2 2 2 1 1 1 2 2 1 ...
# Recode mpg into the bands "A" (< 10), "B" (10 to 20 inclusive) and "C" (> 20).
# Inside within() the columns of mtcars can be referenced by bare name, and
# the object created in the braces becomes a new column.
mtcars <- within(mtcars, {
  mpgcat2 <- NA
  mpgcat2[mpg < 10] <- "A"
  mpgcat2[mpg >= 10 & mpg <= 20] <- "B"
  mpgcat2[mpg > 20] <- "C"
})
# Print the data, then inspect the column types.
mtcars
str(mtcars)
## 'data.frame': 32 obs. of 13 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp : num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat : num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec : num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear : num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb : num 4 4 1 1 2 1 4 2 2 4 ...
## $ mpgcat1: Factor w/ 2 levels "B","C": 2 2 2 2 1 1 1 2 2 1 ...
## $ mpgcat2: chr "C" "C" "C" "C" ...
mtcars$mpgcat2 <- as.factor(mtcars$mpgcat2)
str(mtcars)
## 'data.frame': 32 obs. of 13 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp : num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat : num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec : num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear : num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb : num 4 4 1 1 2 1 4 2 2 4 ...
## $ mpgcat1: Factor w/ 2 levels "B","C": 2 2 2 2 1 1 1 2 2 1 ...
## $ mpgcat2: Factor w/ 2 levels "B","C": 2 2 2 2 1 1 1 2 2 1 ...
a1<-car::recode(mtcars$carb,"1=10;2=20;3=30;4=40;6=60;8=80")
head(cbind(mtcars$carb,a1),10)
## a1
## [1,] 4 40
## [2,] 4 40
## [3,] 1 10
## [4,] 1 10
## [5,] 2 20
## [6,] 1 10
## [7,] 4 40
## [8,] 2 20
## [9,] 2 20
## [10,] 4 40
# ?recode
# http://rprogramming.net/recode-data-in-r/
mtcars$carb2<-doBy::recodeVar(mtcars$carb,src=c(1:8),tgt=c("A","B","C","D","E","F","G","H"))
head(cbind(mtcars$carb2),10)
## [,1]
## [1,] "D"
## [2,] "D"
## [3,] "A"
## [4,] "A"
## [5,] "B"
## [6,] "A"
## [7,] "D"
## [8,] "B"
## [9,] "B"
## [10,] "D"
str(mtcars) # carb2 char
## 'data.frame': 32 obs. of 14 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp : num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat : num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec : num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear : num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb : num 4 4 1 1 2 1 4 2 2 4 ...
## $ mpgcat1: Factor w/ 2 levels "B","C": 2 2 2 2 1 1 1 2 2 1 ...
## $ mpgcat2: Factor w/ 2 levels "B","C": 2 2 2 2 1 1 1 2 2 1 ...
## $ carb2 : chr "D" "D" "A" "A" ...
mtcars$carb2<- as.factor(mtcars$carb2)
str(mtcars)
## 'data.frame': 32 obs. of 14 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp : num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat : num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec : num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear : num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb : num 4 4 1 1 2 1 4 2 2 4 ...
## $ mpgcat1: Factor w/ 2 levels "B","C": 2 2 2 2 1 1 1 2 2 1 ...
## $ mpgcat2: Factor w/ 2 levels "B","C": 2 2 2 2 1 1 1 2 2 1 ...
## $ carb2 : Factor w/ 6 levels "A","B","C","D",..: 4 4 1 1 2 1 4 2 2 4 ...
# ? recodeVar
# Bin mpg with cut(): the breaks give the intervals (-Inf, 20], (20, 30] and
# (30, Inf), labelled "A", "B" and "C"; the result is a factor.
# NOTE(review): these cut points (20, 30) differ from the 10/20 thresholds
# used for mpgcat1 and mpgcat2 -- confirm this is intended.
mtcars$mpgcat3<-cut(mtcars$mpg,
breaks=c(-Inf,20,30,Inf),
labels = c("A","B","C"))
head(mtcars)
###plyr::revalue(), mapvalues()
# Relabel the carb levels with letters. carb takes the values 1, 2, 3, 4, 6
# and 8 in mtcars, and both functions leave unlisted levels unchanged, so
# every level (including "8") must be mapped to avoid a leftover "8" label.
# revalue() takes a named vector c(old = "new").
mtcars$carb3 <- plyr::revalue(
  as.factor(mtcars$carb),
  c("1" = "A", "2" = "B", "3" = "C", "4" = "D", "6" = "E", "8" = "F")
)
# mapvalues() takes the old and new values as two parallel vectors.
mtcars$carb4 <- plyr::mapvalues(
  as.factor(mtcars$carb),
  from = c("1", "2", "3", "4", "6", "8"),
  to = c("A", "B", "C", "D", "E", "F")
)
head(mtcars)
# Interactive data editors; these open an editor window and need an
# interactive R session.
# fix() edits mtcars and assigns the result back to mtcars.
fix(mtcars)
# edit() returns the edited copy without changing mtcars itself.
edit(mtcars)
# data.entry() opens a spreadsheet-like editor for the object.
data.entry(mtcars)
reshape::rename()
# rename(mtcars, c(wt = "weight", cyl = "cylinders"))
names(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec"
## [8] "vs" "am" "gear" "carb" "mpgcat1" "mpgcat2" "carb2"
## [15] "mpgcat3" "carb3" "carb4"
NA Not Available
x <- c(1, 99, 3, NA, 5, 5, NA, 99, 3, 3, NA, 1, 3, 5, 1, 1 )
is.na(x)
## [1] FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
## [13] FALSE FALSE FALSE FALSE
x < 3
## [1] TRUE FALSE FALSE NA FALSE FALSE NA FALSE FALSE FALSE NA TRUE
## [13] FALSE FALSE TRUE TRUE
x == 99
## [1] FALSE TRUE FALSE NA FALSE FALSE NA TRUE FALSE FALSE NA FALSE
## [13] FALSE FALSE FALSE FALSE
NA cannot be used in comparisons: x == NA is always NA, so use is.na(x) to test for missing values.
x == NA
## [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
sample data
data(airquality)
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
is.na(airquality)
## Ozone Solar.R Wind Temp Month Day
## [1,] FALSE FALSE FALSE FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE FALSE FALSE FALSE
## [3,] FALSE FALSE FALSE FALSE FALSE FALSE
## [4,] FALSE FALSE FALSE FALSE FALSE FALSE
## [5,] TRUE TRUE FALSE FALSE FALSE FALSE
## [6,] FALSE TRUE FALSE FALSE FALSE FALSE
## [7,] FALSE FALSE FALSE FALSE FALSE FALSE
## [8,] FALSE FALSE FALSE FALSE FALSE FALSE
## [9,] FALSE FALSE FALSE FALSE FALSE FALSE
## [10,] TRUE FALSE FALSE FALSE FALSE FALSE
## [11,] FALSE TRUE FALSE FALSE FALSE FALSE
## [12,] FALSE FALSE FALSE FALSE FALSE FALSE
## [13,] FALSE FALSE FALSE FALSE FALSE FALSE
## [14,] FALSE FALSE FALSE FALSE FALSE FALSE
## [15,] FALSE FALSE FALSE FALSE FALSE FALSE
## [16,] FALSE FALSE FALSE FALSE FALSE FALSE
## [17,] FALSE FALSE FALSE FALSE FALSE FALSE
## [18,] FALSE FALSE FALSE FALSE FALSE FALSE
## [19,] FALSE FALSE FALSE FALSE FALSE FALSE
## [20,] FALSE FALSE FALSE FALSE FALSE FALSE
## [21,] FALSE FALSE FALSE FALSE FALSE FALSE
## [22,] FALSE FALSE FALSE FALSE FALSE FALSE
## [23,] FALSE FALSE FALSE FALSE FALSE FALSE
## [24,] FALSE FALSE FALSE FALSE FALSE FALSE
## [25,] TRUE FALSE FALSE FALSE FALSE FALSE
## [26,] TRUE FALSE FALSE FALSE FALSE FALSE
## [27,] TRUE TRUE FALSE FALSE FALSE FALSE
## [28,] FALSE FALSE FALSE FALSE FALSE FALSE
## [29,] FALSE FALSE FALSE FALSE FALSE FALSE
## [30,] FALSE FALSE FALSE FALSE FALSE FALSE
## [31,] FALSE FALSE FALSE FALSE FALSE FALSE
## [32,] TRUE FALSE FALSE FALSE FALSE FALSE
## [33,] TRUE FALSE FALSE FALSE FALSE FALSE
## [34,] TRUE FALSE FALSE FALSE FALSE FALSE
## [35,] TRUE FALSE FALSE FALSE FALSE FALSE
## [36,] TRUE FALSE FALSE FALSE FALSE FALSE
## [37,] TRUE FALSE FALSE FALSE FALSE FALSE
## [38,] FALSE FALSE FALSE FALSE FALSE FALSE
## [39,] TRUE FALSE FALSE FALSE FALSE FALSE
## [40,] FALSE FALSE FALSE FALSE FALSE FALSE
## [41,] FALSE FALSE FALSE FALSE FALSE FALSE
## [42,] TRUE FALSE FALSE FALSE FALSE FALSE
## [43,] TRUE FALSE FALSE FALSE FALSE FALSE
## [44,] FALSE FALSE FALSE FALSE FALSE FALSE
## [45,] TRUE FALSE FALSE FALSE FALSE FALSE
## [46,] TRUE FALSE FALSE FALSE FALSE FALSE
## [47,] FALSE FALSE FALSE FALSE FALSE FALSE
## [48,] FALSE FALSE FALSE FALSE FALSE FALSE
## [49,] FALSE FALSE FALSE FALSE FALSE FALSE
## [50,] FALSE FALSE FALSE FALSE FALSE FALSE
## [51,] FALSE FALSE FALSE FALSE FALSE FALSE
## [52,] TRUE FALSE FALSE FALSE FALSE FALSE
## [53,] TRUE FALSE FALSE FALSE FALSE FALSE
## [54,] TRUE FALSE FALSE FALSE FALSE FALSE
## [55,] TRUE FALSE FALSE FALSE FALSE FALSE
## [56,] TRUE FALSE FALSE FALSE FALSE FALSE
## [57,] TRUE FALSE FALSE FALSE FALSE FALSE
## [58,] TRUE FALSE FALSE FALSE FALSE FALSE
## [59,] TRUE FALSE FALSE FALSE FALSE FALSE
## [60,] TRUE FALSE FALSE FALSE FALSE FALSE
## [61,] TRUE FALSE FALSE FALSE FALSE FALSE
## [62,] FALSE FALSE FALSE FALSE FALSE FALSE
## [63,] FALSE FALSE FALSE FALSE FALSE FALSE
## [64,] FALSE FALSE FALSE FALSE FALSE FALSE
## [65,] TRUE FALSE FALSE FALSE FALSE FALSE
## [66,] FALSE FALSE FALSE FALSE FALSE FALSE
## [67,] FALSE FALSE FALSE FALSE FALSE FALSE
## [68,] FALSE FALSE FALSE FALSE FALSE FALSE
## [69,] FALSE FALSE FALSE FALSE FALSE FALSE
## [70,] FALSE FALSE FALSE FALSE FALSE FALSE
## [71,] FALSE FALSE FALSE FALSE FALSE FALSE
## [72,] TRUE FALSE FALSE FALSE FALSE FALSE
## [73,] FALSE FALSE FALSE FALSE FALSE FALSE
## [74,] FALSE FALSE FALSE FALSE FALSE FALSE
## [75,] TRUE FALSE FALSE FALSE FALSE FALSE
## [76,] FALSE FALSE FALSE FALSE FALSE FALSE
## [77,] FALSE FALSE FALSE FALSE FALSE FALSE
## [78,] FALSE FALSE FALSE FALSE FALSE FALSE
## [79,] FALSE FALSE FALSE FALSE FALSE FALSE
## [80,] FALSE FALSE FALSE FALSE FALSE FALSE
## [81,] FALSE FALSE FALSE FALSE FALSE FALSE
## [82,] FALSE FALSE FALSE FALSE FALSE FALSE
## [83,] TRUE FALSE FALSE FALSE FALSE FALSE
## [84,] TRUE FALSE FALSE FALSE FALSE FALSE
## [85,] FALSE FALSE FALSE FALSE FALSE FALSE
## [86,] FALSE FALSE FALSE FALSE FALSE FALSE
## [87,] FALSE FALSE FALSE FALSE FALSE FALSE
## [88,] FALSE FALSE FALSE FALSE FALSE FALSE
## [89,] FALSE FALSE FALSE FALSE FALSE FALSE
## [90,] FALSE FALSE FALSE FALSE FALSE FALSE
## [91,] FALSE FALSE FALSE FALSE FALSE FALSE
## [92,] FALSE FALSE FALSE FALSE FALSE FALSE
## [93,] FALSE FALSE FALSE FALSE FALSE FALSE
## [94,] FALSE FALSE FALSE FALSE FALSE FALSE
## [95,] FALSE FALSE FALSE FALSE FALSE FALSE
## [96,] FALSE TRUE FALSE FALSE FALSE FALSE
## [97,] FALSE TRUE FALSE FALSE FALSE FALSE
## [98,] FALSE TRUE FALSE FALSE FALSE FALSE
## [99,] FALSE FALSE FALSE FALSE FALSE FALSE
## [100,] FALSE FALSE FALSE FALSE FALSE FALSE
## [101,] FALSE FALSE FALSE FALSE FALSE FALSE
## [102,] TRUE FALSE FALSE FALSE FALSE FALSE
## [103,] TRUE FALSE FALSE FALSE FALSE FALSE
## [104,] FALSE FALSE FALSE FALSE FALSE FALSE
## [105,] FALSE FALSE FALSE FALSE FALSE FALSE
## [106,] FALSE FALSE FALSE FALSE FALSE FALSE
## [107,] TRUE FALSE FALSE FALSE FALSE FALSE
## [108,] FALSE FALSE FALSE FALSE FALSE FALSE
## [109,] FALSE FALSE FALSE FALSE FALSE FALSE
## [110,] FALSE FALSE FALSE FALSE FALSE FALSE
## [111,] FALSE FALSE FALSE FALSE FALSE FALSE
## [112,] FALSE FALSE FALSE FALSE FALSE FALSE
## [113,] FALSE FALSE FALSE FALSE FALSE FALSE
## [114,] FALSE FALSE FALSE FALSE FALSE FALSE
## [115,] TRUE FALSE FALSE FALSE FALSE FALSE
## [116,] FALSE FALSE FALSE FALSE FALSE FALSE
## [117,] FALSE FALSE FALSE FALSE FALSE FALSE
## [118,] FALSE FALSE FALSE FALSE FALSE FALSE
## [119,] TRUE FALSE FALSE FALSE FALSE FALSE
## [120,] FALSE FALSE FALSE FALSE FALSE FALSE
## [121,] FALSE FALSE FALSE FALSE FALSE FALSE
## [122,] FALSE FALSE FALSE FALSE FALSE FALSE
## [123,] FALSE FALSE FALSE FALSE FALSE FALSE
## [124,] FALSE FALSE FALSE FALSE FALSE FALSE
## [125,] FALSE FALSE FALSE FALSE FALSE FALSE
## [126,] FALSE FALSE FALSE FALSE FALSE FALSE
## [127,] FALSE FALSE FALSE FALSE FALSE FALSE
## [128,] FALSE FALSE FALSE FALSE FALSE FALSE
## [129,] FALSE FALSE FALSE FALSE FALSE FALSE
## [130,] FALSE FALSE FALSE FALSE FALSE FALSE
## [131,] FALSE FALSE FALSE FALSE FALSE FALSE
## [132,] FALSE FALSE FALSE FALSE FALSE FALSE
## [133,] FALSE FALSE FALSE FALSE FALSE FALSE
## [134,] FALSE FALSE FALSE FALSE FALSE FALSE
## [135,] FALSE FALSE FALSE FALSE FALSE FALSE
## [136,] FALSE FALSE FALSE FALSE FALSE FALSE
## [137,] FALSE FALSE FALSE FALSE FALSE FALSE
## [138,] FALSE FALSE FALSE FALSE FALSE FALSE
## [139,] FALSE FALSE FALSE FALSE FALSE FALSE
## [140,] FALSE FALSE FALSE FALSE FALSE FALSE
## [141,] FALSE FALSE FALSE FALSE FALSE FALSE
## [142,] FALSE FALSE FALSE FALSE FALSE FALSE
## [143,] FALSE FALSE FALSE FALSE FALSE FALSE
## [144,] FALSE FALSE FALSE FALSE FALSE FALSE
## [145,] FALSE FALSE FALSE FALSE FALSE FALSE
## [146,] FALSE FALSE FALSE FALSE FALSE FALSE
## [147,] FALSE FALSE FALSE FALSE FALSE FALSE
## [148,] FALSE FALSE FALSE FALSE FALSE FALSE
## [149,] FALSE FALSE FALSE FALSE FALSE FALSE
## [150,] TRUE FALSE FALSE FALSE FALSE FALSE
## [151,] FALSE FALSE FALSE FALSE FALSE FALSE
## [152,] FALSE FALSE FALSE FALSE FALSE FALSE
## [153,] FALSE FALSE FALSE FALSE FALSE FALSE
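NA + anything = NA in arithmetic; logical operators can still give a definite answer (for example, NA | TRUE is TRUE).
NaN: Not a Number. Compare naniar::any_na(NaN), any_na(NULL), any_na(Inf).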
NA | TRUE
## [1] TRUE
NA | FALSE
## [1] NA
NA + NaN
## [1] NA
NaN + NA
## [1] NA
x[x == 99] <- NA
x
## [1] 1 NA 3 NA 5 5 NA NA 3 3 NA 1 3 5 1 1
na.omit, na.exclude, na.pass, na.fail, na.action, na.rm, na.last, useNA
mean(airquality$Ozone)
## [1] NA
mean(airquality$Ozone,na.rm=TRUE)
## [1] 42.12931
mean(na.omit(airquality$Ozone))
## [1] 42.12931
na.omit(airquality$Ozone)
## [1] 41 36 12 18 28 23 19 8 7 16 11 14 18 14 34 6 30 11
## [19] 1 11 4 32 23 45 115 37 29 71 39 23 21 37 20 12 13 135
## [37] 49 32 64 40 77 97 97 85 10 27 7 48 35 61 79 63 16 80
## [55] 108 20 52 82 50 64 59 39 9 16 78 35 66 122 89 110 44 28
## [73] 65 22 59 23 31 44 21 9 45 168 73 76 118 84 85 96 78 73
## [91] 91 47 32 20 23 21 24 44 21 28 9 13 46 18 13 24 16 13
## [109] 23 36 7 14 30 14 18 20
## attr(,"na.action")
## [1] 5 10 25 26 27 32 33 34 35 36 37 39 42 43 45 46 52 53 54
## [20] 55 56 57 58 59 60 61 65 72 75 83 84 102 103 107 115 119 150
## attr(,"class")
## [1] "omit"
airquality$Ozone
## [1] 41 36 12 18 NA 28 23 19 8 NA 7 16 11 14 18 14 34 6
## [19] 30 11 1 11 4 32 NA NA NA 23 45 115 37 NA NA NA NA NA
## [37] NA 29 NA 71 39 NA NA 23 NA NA 21 37 20 12 13 NA NA NA
## [55] NA NA NA NA NA NA NA 135 49 32 NA 64 40 77 97 97 85 NA
## [73] 10 27 NA 7 48 35 61 79 63 16 NA NA 80 108 20 52 82 50
## [91] 64 59 39 9 16 78 35 66 122 89 110 NA NA 44 28 65 NA 22
## [109] 59 23 31 44 21 9 NA 45 168 73 NA 76 118 84 85 96 78 73
## [127] 91 47 32 20 23 21 24 44 21 28 9 13 46 18 13 24 16 13
## [145] 23 36 7 14 30 NA 14 18 20
na.omit(airquality)
returns the object with observations removed if they contain any missing values
na.exclude(airquality)
differences between omitting and excluding NAs can be seen in some prediction and residual functions
na.fail(airquality)
returns the object only if it contains no missing values
na.pass(airquality)
returns the object unchanged
# Fit the same regression of Ozone on Solar.R twice; the two calls differ only
# in the na.action used for rows with missing values (na.omit vs na.exclude).
omit.model<- lm(Ozone ~ Solar.R, data = airquality, na.action = na.omit)
exclude.model<- lm(Ozone ~ Solar.R, data = airquality, na.action = na.exclude)
omit.model
##
## Call:
## lm(formula = Ozone ~ Solar.R, data = airquality, na.action = na.omit)
##
## Coefficients:
## (Intercept) Solar.R
## 18.5987 0.1272
exclude.model
##
## Call:
## lm(formula = Ozone ~ Solar.R, data = airquality, na.action = na.exclude)
##
## Coefficients:
## (Intercept) Solar.R
## 18.5987 0.1272
resid(omit.model)
## 1 2 3 4 7 8
## -1.7601294 2.3957702 -25.5463532 -40.4014578 -33.6211440 -12.1880897
## 9 12 13 14 15 16
## -13.0148679 -35.1530373 -44.4766565 -39.4420122 -8.8644704 -47.0719285
## 17 18 19 20 21 22
## -23.6384662 -22.5176190 -29.5459452 -13.1939997 -18.6160499 -48.2916147
## 23 24 28 29 30 31
## -17.7778596 1.7020672 2.7481237 -5.6443762 68.0434167 -17.0778386
## 38 40 41 44 47 48
## -5.7487173 15.3961782 -20.6731105 -14.4191880 -21.8872947 -17.7136649
## 49 50 51 62 63 64
## -3.3038428 -21.8585604 -23.0203700 82.1938142 -1.1357151 -16.6097319
## 66 67 68 69 70 71
## 23.1473497 -18.5286231 23.3036573 44.4481447 43.8123183 44.1473497
## 73 74 76 77 78 79
## -42.1703595 -13.8526503 -17.7026608 -3.6616984 -18.4420122 6.1591698
## 80 81 82 85 86 87
## 36.6213664 16.4249125 -3.4888847 24.0146824 61.0434167 -8.8991148
## 88 89 90 91 92 93
## 22.9737200 36.3150694 -3.5691775 13.2284585 8.1012932 9.8465547
## 94 95 99 100 101 104
## -12.6506943 -12.3904537 70.9741280 41.2804250 65.0780610 0.9855401
## 105 106 108 109 110 111
## -25.3148469 26.4363246 -5.6274621 33.9158434 -10.2227340 -18.6270541
## 112 113 114 116 117 118
## 1.2398706 -30.5345331 -14.1766776 -0.5577654 119.1359376 27.0607388
## 120 121 122 123 124 125
## 31.5867221 70.7890861 35.2631028 42.4942012 56.1646719 34.3497137
## 126 127 128 129 130 131
## 31.1300275 48.3670359 16.3205714 1.7020672 -30.6443762 -23.5750875
## 132 133 134 135 136 137
## -26.8467403 -27.5345331 -4.6097319 -30.5345331 -20.8640624 -12.6506943
## 138 139 140 141 142 143
## -19.8412382 -2.7368972 -29.0837486 -9.0321901 -24.8640624 -28.1589474
## 144 145 146 147 148 149
## -35.8640624 2.6209584 -0.2747005 -17.8298261 -7.1420332 -13.1416252
## 151 152 153
## -28.8872947 -17.2573784 -26.9565833
resid(exclude.model)
## 1 2 3 4 5 6
## -1.7601294 2.3957702 -25.5463532 -40.4014578 NA NA
## 7 8 9 10 11 12
## -33.6211440 -12.1880897 -13.0148679 NA NA -35.1530373
## 13 14 15 16 17 18
## -44.4766565 -39.4420122 -8.8644704 -47.0719285 -23.6384662 -22.5176190
## 19 20 21 22 23 24
## -29.5459452 -13.1939997 -18.6160499 -48.2916147 -17.7778596 1.7020672
## 25 26 27 28 29 30
## NA NA NA 2.7481237 -5.6443762 68.0434167
## 31 32 33 34 35 36
## -17.0778386 NA NA NA NA NA
## 37 38 39 40 41 42
## NA -5.7487173 NA 15.3961782 -20.6731105 NA
## 43 44 45 46 47 48
## NA -14.4191880 NA NA -21.8872947 -17.7136649
## 49 50 51 52 53 54
## -3.3038428 -21.8585604 -23.0203700 NA NA NA
## 55 56 57 58 59 60
## NA NA NA NA NA NA
## 61 62 63 64 65 66
## NA 82.1938142 -1.1357151 -16.6097319 NA 23.1473497
## 67 68 69 70 71 72
## -18.5286231 23.3036573 44.4481447 43.8123183 44.1473497 NA
## 73 74 75 76 77 78
## -42.1703595 -13.8526503 NA -17.7026608 -3.6616984 -18.4420122
## 79 80 81 82 83 84
## 6.1591698 36.6213664 16.4249125 -3.4888847 NA NA
## 85 86 87 88 89 90
## 24.0146824 61.0434167 -8.8991148 22.9737200 36.3150694 -3.5691775
## 91 92 93 94 95 96
## 13.2284585 8.1012932 9.8465547 -12.6506943 -12.3904537 NA
## 97 98 99 100 101 102
## NA NA 70.9741280 41.2804250 65.0780610 NA
## 103 104 105 106 107 108
## NA 0.9855401 -25.3148469 26.4363246 NA -5.6274621
## 109 110 111 112 113 114
## 33.9158434 -10.2227340 -18.6270541 1.2398706 -30.5345331 -14.1766776
## 115 116 117 118 119 120
## NA -0.5577654 119.1359376 27.0607388 NA 31.5867221
## 121 122 123 124 125 126
## 70.7890861 35.2631028 42.4942012 56.1646719 34.3497137 31.1300275
## 127 128 129 130 131 132
## 48.3670359 16.3205714 1.7020672 -30.6443762 -23.5750875 -26.8467403
## 133 134 135 136 137 138
## -27.5345331 -4.6097319 -30.5345331 -20.8640624 -12.6506943 -19.8412382
## 139 140 141 142 143 144
## -2.7368972 -29.0837486 -9.0321901 -24.8640624 -28.1589474 -35.8640624
## 145 146 147 148 149 150
## 2.6209584 -0.2747005 -17.8298261 -7.1420332 -13.1416252 NA
## 151 152 153
## -28.8872947 -17.2573784 -26.9565833
# The two residual vectors have different lengths (na.omit drops the rows with
# missing values, na.exclude keeps them as NA), so data.frame() raises an
# error. try() reports that error without stopping the rest of the script.
try(data.frame(resid(omit.model), resid(exclude.model))) # error
fitted(omit.model)
## 1 2 3 4 7 8 9 12
## 42.76013 33.60423 37.54635 58.40146 56.62114 31.18809 21.01487 51.15304
## 13 14 15 16 17 18 19 20
## 55.47666 53.44201 26.86447 61.07193 57.63847 28.51762 59.54595 24.19400
## 21 22 23 24 28 29 30 31
## 19.61605 59.29161 21.77786 30.29793 20.25188 50.64438 46.95658 54.07784
## 38 40 41 44 47 48 49 50
## 34.74872 55.60382 59.67311 37.41919 42.88729 54.71366 23.30384 33.85856
## 51 62 63 64 66 67 68 69
## 36.02037 52.80619 50.13572 48.60973 40.85265 58.52862 53.69634 52.55186
## 70 71 73 74 76 77 78 79
## 53.18768 40.85265 52.17036 40.85265 24.70266 51.66170 53.44201 54.84083
## 80 81 82 85 86 87 88 89
## 42.37863 46.57509 19.48888 55.98532 46.95658 28.89911 29.02628 45.68493
## 90 91 92 93 94 95 99 100
## 53.56918 50.77154 50.89871 29.15345 21.65069 28.39045 51.02587 47.71957
## 101 104 105 106 108 109 110 111
## 44.92194 43.01446 53.31485 38.56368 27.62746 25.08416 33.22273 49.62705
## 112 113 114 116 117 118 120 121
## 42.76013 51.53453 23.17668 45.55777 48.86406 45.93926 44.41328 47.21091
## 122 123 124 125 126 127 128 129
## 48.73690 42.50580 39.83533 43.65029 41.86997 42.63296 30.67943 30.29793
## 130 131 132 133 134 135 136 137
## 50.64438 46.57509 47.84674 51.53453 48.60973 51.53453 48.86406 21.65069
## 138 139 140 141 142 143 144 145
## 32.84124 48.73690 47.08375 22.03219 48.86406 44.15895 48.86406 20.37904
## 146 147 148 149 151 152 153
## 36.27470 24.82983 21.14203 43.14163 42.88729 35.25738 46.95658
fitted(exclude.model)
## 1 2 3 4 5 6 7 8
## 42.76013 33.60423 37.54635 58.40146 NA NA 56.62114 31.18809
## 9 10 11 12 13 14 15 16
## 21.01487 NA NA 51.15304 55.47666 53.44201 26.86447 61.07193
## 17 18 19 20 21 22 23 24
## 57.63847 28.51762 59.54595 24.19400 19.61605 59.29161 21.77786 30.29793
## 25 26 27 28 29 30 31 32
## NA NA NA 20.25188 50.64438 46.95658 54.07784 NA
## 33 34 35 36 37 38 39 40
## NA NA NA NA NA 34.74872 NA 55.60382
## 41 42 43 44 45 46 47 48
## 59.67311 NA NA 37.41919 NA NA 42.88729 54.71366
## 49 50 51 52 53 54 55 56
## 23.30384 33.85856 36.02037 NA NA NA NA NA
## 57 58 59 60 61 62 63 64
## NA NA NA NA NA 52.80619 50.13572 48.60973
## 65 66 67 68 69 70 71 72
## NA 40.85265 58.52862 53.69634 52.55186 53.18768 40.85265 NA
## 73 74 75 76 77 78 79 80
## 52.17036 40.85265 NA 24.70266 51.66170 53.44201 54.84083 42.37863
## 81 82 83 84 85 86 87 88
## 46.57509 19.48888 NA NA 55.98532 46.95658 28.89911 29.02628
## 89 90 91 92 93 94 95 96
## 45.68493 53.56918 50.77154 50.89871 29.15345 21.65069 28.39045 NA
## 97 98 99 100 101 102 103 104
## NA NA 51.02587 47.71957 44.92194 NA NA 43.01446
## 105 106 107 108 109 110 111 112
## 53.31485 38.56368 NA 27.62746 25.08416 33.22273 49.62705 42.76013
## 113 114 115 116 117 118 119 120
## 51.53453 23.17668 NA 45.55777 48.86406 45.93926 NA 44.41328
## 121 122 123 124 125 126 127 128
## 47.21091 48.73690 42.50580 39.83533 43.65029 41.86997 42.63296 30.67943
## 129 130 131 132 133 134 135 136
## 30.29793 50.64438 46.57509 47.84674 51.53453 48.60973 51.53453 48.86406
## 137 138 139 140 141 142 143 144
## 21.65069 32.84124 48.73690 47.08375 22.03219 48.86406 44.15895 48.86406
## 145 146 147 148 149 150 151 152
## 20.37904 36.27470 24.82983 21.14203 43.14163 NA 42.88729 35.25738
## 153
## 46.95658
Neither na.omit nor na.exclude uses the missing values when fitting; the difference is that na.exclude keeps their positions (as NA) in the residuals and fitted values, while na.omit drops them.
summary(airquality$Ozone)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 18.00 31.50 42.13 63.25 168.00 37
table(airquality$Ozone)
##
## 1 4 6 7 8 9 10 11 12 13 14 16 18 19 20 21 22 23 24 27
## 1 1 1 3 1 3 1 3 2 4 4 4 4 1 4 4 1 6 2 1
## 28 29 30 31 32 34 35 36 37 39 40 41 44 45 46 47 48 49 50 52
## 3 1 2 1 3 1 2 2 2 2 1 1 3 2 1 1 1 1 1 1
## 59 61 63 64 65 66 71 73 76 77 78 79 80 82 84 85 89 91 96 97
## 2 1 1 2 1 1 1 2 1 1 2 1 1 1 1 2 1 1 1 2
## 108 110 115 118 122 135 168
## 1 1 1 1 1 1 1
table(airquality$Ozone,useNA="ifany")
##
## 1 4 6 7 8 9 10 11 12 13 14 16 18 19 20 21
## 1 1 1 3 1 3 1 3 2 4 4 4 4 1 4 4
## 22 23 24 27 28 29 30 31 32 34 35 36 37 39 40 41
## 1 6 2 1 3 1 2 1 3 1 2 2 2 2 1 1
## 44 45 46 47 48 49 50 52 59 61 63 64 65 66 71 73
## 3 2 1 1 1 1 1 1 2 1 1 2 1 1 1 2
## 76 77 78 79 80 82 84 85 89 91 96 97 108 110 115 118
## 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 1
## 122 135 168 <NA>
## 1 1 1 37
table(airquality$Ozone, useNA="always")
##
## 1 4 6 7 8 9 10 11 12 13 14 16 18 19 20 21
## 1 1 1 3 1 3 1 3 2 4 4 4 4 1 4 4
## 22 23 24 27 28 29 30 31 32 34 35 36 37 39 40 41
## 1 6 2 1 3 1 2 1 3 1 2 2 2 2 1 1
## 44 45 46 47 48 49 50 52 59 61 63 64 65 66 71 73
## 3 2 1 1 1 1 1 1 2 1 1 2 1 1 1 2
## 76 77 78 79 80 82 84 85 89 91 96 97 108 110 115 118
## 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 1
## 122 135 168 <NA>
## 1 1 1 37
length(airquality$Ozone)
## [1] 153
x1 <- sort(airquality$Ozone)
x1
## [1] 1 4 6 7 7 7 8 9 9 9 10 11 11 11 12 12 13 13
## [19] 13 13 14 14 14 14 16 16 16 16 18 18 18 18 19 20 20 20
## [37] 20 21 21 21 21 22 23 23 23 23 23 23 24 24 27 28 28 28
## [55] 29 30 30 31 32 32 32 34 35 35 36 36 37 37 39 39 40 41
## [73] 44 44 44 45 45 46 47 48 49 50 52 59 59 61 63 64 64 65
## [91] 66 71 73 73 76 77 78 78 79 80 82 84 85 85 89 91 96 97
## [109] 97 108 110 115 118 122 135 168
length(x1)
## [1] 116
x2 <- sort(airquality$Ozone, na.last = TRUE)
x2
## [1] 1 4 6 7 7 7 8 9 9 9 10 11 11 11 12 12 13 13
## [19] 13 13 14 14 14 14 16 16 16 16 18 18 18 18 19 20 20 20
## [37] 20 21 21 21 21 22 23 23 23 23 23 23 24 24 27 28 28 28
## [55] 29 30 30 31 32 32 32 34 35 35 36 36 37 37 39 39 40 41
## [73] 44 44 44 45 45 46 47 48 49 50 52 59 59 61 63 64 64 65
## [91] 66 71 73 73 76 77 78 78 79 80 82 84 85 85 89 91 96 97
## [109] 97 108 110 115 118 122 135 168 NA NA NA NA NA NA NA NA NA NA
## [127] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [145] NA NA NA NA NA NA NA NA NA
length(x2)
## [1] 153
complete.cases, VIM::countNA(), mice::md.pattern()
x
## [1] 1 NA 3 NA 5 5 NA NA 3 3 NA 1 3 5 1 1
complete.cases(x) # tam satirlar
## [1] TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE
## [13] TRUE TRUE TRUE TRUE
is.na(x)
## [1] FALSE TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE TRUE FALSE
## [13] FALSE FALSE FALSE FALSE
!complete.cases(x)
## [1] FALSE TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE TRUE FALSE
## [13] FALSE FALSE FALSE FALSE
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
dim(airquality)
## [1] 153 6
missing data olmayan satirlar
airquality[complete.cases(airquality), ]
dim(airquality[complete.cases(airquality), ])
## [1] 111 6
en az bir missing data olan satirlar
airquality[!complete.cases(airquality), ]
dim(airquality[!complete.cases(airquality), ])
## [1] 42 6
mantiksal operatorlerin sayisal degerleri 1 ve 0 oldugundan,
sum(is.na(airquality$Ozone)) # number of missing observations in the Ozone variable
## [1] 37
VIM::countNA(airquality$Ozone) # the same count using the VIM package
## [1] 37
# Use the airquality data analysed throughout this section. `sleep` resolves
# to datasets::sleep, which has no Dream column and no missing values, so the
# previous calls returned NaN and 0.
mean(is.na(airquality$Ozone)) # proportion of missing observations in the Ozone variable
## [1] 0.2418301
mean(!complete.cases(airquality)) # proportion of rows with at least one missing value
## [1] 0.2745098
missing data pattern
mice::md.pattern(airquality) # mice paketi
## Wind Temp Month Day Solar.R Ozone
## 111 1 1 1 1 1 1 0
## 35 1 1 1 1 1 0 1
## 5 1 1 1 1 0 1 1
## 2 1 1 1 1 0 0 2
## 0 0 0 0 7 37 44
**missing data visualization**
VIM package
a<-VIM::aggr(airquality, prop=FALSE, numbers=TRUE) # VIM paketi
summary(a) # missing data oruntusu
##
## Missings per variable:
## Variable Count
## Ozone 37
## Solar.R 7
## Wind 0
## Temp 0
## Month 0
## Day 0
##
## Missings in combinations of variables:
## Combinations Count Percent
## 0:0:0:0:0:0 111 72.549020
## 0:1:0:0:0:0 5 3.267974
## 1:0:0:0:0:0 35 22.875817
## 1:1:0:0:0:0 2 1.307190
VIM::matrixplot(airquality)
##
## Click in a column to sort by the corresponding variable.
## To regain use of the VIM GUI and the R console, click outside the plot region.
VIM::matrixplot(airquality, interactive = TRUE, sortby = "Ozone") #!
##
## Click in a column to sort by the corresponding variable.
## To regain use of the VIM GUI and the R console, click outside the plot region.
VIM::marginplot(airquality[,c("Ozone","Solar.R")])
VIM::marginplot(airquality[c("Ozone","Solar.R")], pch=c(20),col=c("darkgray", "red", "blue"))
VIM::marginmatrix(airquality[,-5])
VIM::barMiss(airquality[,c("Month","Ozone")])
##
## Click in in the left margin to switch to the previous variable or in the right margin to switch to the next variable.
## To regain use of the VIM GUI and the R console, click anywhere else in the graphics window.
airquality[,c("Month","Ozone")] # to view the data behind the plot
plot(airquality$Ozone, airquality$Solar.R)
VIM::rugNA(airquality$Ozone, airquality$Solar.R,side=1) # shows the values missing on the y axis along the x axis
VIM::rugNA(airquality$Ozone, airquality$Solar.R,ticksize = 1, col= "orange", side=2) # with the miss argument, imputed values can be shown instead of missing ones; see ?rugNA
VIM::scattmatrixMiss(airquality) # all variables highlighted; the delimiter argument is used for imputed values, see ?scattmatrixMiss
##
## Click in a diagonal panel to add to or remove from the highlight selection.
## To regain use of the VIM GUI and the R console, click anywhere else in the graphics window.
##
## Highlighted missings in any of the variables 'Ozone', 'Solar.R', 'Wind', 'Temp', 'Month', 'Day'.
VIM::scattmatrixMiss(airquality, highlight = "Ozone")
##
## Click in a diagonal panel to add to or remove from the highlight selection.
## To regain use of the VIM GUI and the R console, click anywhere else in the graphics window.
##
## Highlighted 'missings' in variable 'Ozone'.
VIM::pbox(airquality) # parallel boxplots obs ve miss icin
## Warning in createPlot(main, sub, xlab, ylab, labels, ca$at): not enough space
## to display frequencies
##
## Click in in the left margin to switch to the previous variable or in the right margin to switch to the next variable.
## To regain use of the VIM GUI and the R console, click anywhere else in the graphics window.
VIM::parcoordMiss(airquality) #paralel koordinatlar grafigi
##
## Click on a coordinate axis to add to or remove from the highlight selection.
## Click in the top margin to toggle visualizing missing values in the plot variables.
## To regain use of the VIM GUI and the R console, click in any of the other plot margins.
##
## Highlighted missings in any of the variables 'Ozone', 'Solar.R', 'Wind', 'Temp', 'Month', 'Day'.
VIM::scattJitt(airquality[,1:2]) # Ozone ve Solar.R icin jittered scatterplot
plot(airquality[,1:2])
VIM::spineMiss(airquality[,c("Month","Solar.R")]) # spineplot /spinogram
##
## Click in in the left margin to switch to the previous variable or in the right margin to switch to the next variable.
## To regain use of the VIM GUI and the R console, click anywhere else in the graphics window.
VIM::scattMiss(airquality[,c("Ozone","Solar.R")]) # missing data line ile gosterilir
##
## Click in bottom or left margin to change the 'side' argument accordingly.
## To regain use of the VIM GUI and the R console, click anywhere else in the graphics window.
VIM::mosaicMiss(airquality, high = 4, plotvars = 5:6) # mosaic plot
Amelia paketi ile gorsellestirme
Amelia::missmap(airquality)
Bugun <- Sys.Date()
Bugun
## [1] "2024-01-01"
class(Bugun)
## [1] "Date"
typeof(Bugun)
## [1] "double"
mode(Bugun)
## [1] "numeric"
Bu.an<-Sys.time()
Bu.an
## [1] "2024-01-01 19:49:48 +03"
class(Bu.an)
## [1] "POSIXct" "POSIXt"
typeof(Bu.an)
## [1] "double"
mode(Bu.an)
## [1] "numeric"
date()
## [1] "Mon Jan 1 19:49:48 2024"
YYYY-MM-DD
yarin <- as.Date("2018-05-05")
yarin
## [1] "2018-05-05"
weekdays(Bugun)
## [1] "Monday"
haftaya<-Bugun+7
haftaya
## [1] "2024-01-08"
x1<-1:15
Bugun + x1
## [1] "2024-01-02" "2024-01-03" "2024-01-04" "2024-01-05" "2024-01-06"
## [6] "2024-01-07" "2024-01-08" "2024-01-09" "2024-01-10" "2024-01-11"
## [11] "2024-01-12" "2024-01-13" "2024-01-14" "2024-01-15" "2024-01-16"
seq(Bugun, by=2, length.out = 15)
## [1] "2024-01-01" "2024-01-03" "2024-01-05" "2024-01-07" "2024-01-09"
## [6] "2024-01-11" "2024-01-13" "2024-01-15" "2024-01-17" "2024-01-19"
## [11] "2024-01-21" "2024-01-23" "2024-01-25" "2024-01-27" "2024-01-29"
use different date format
# Dates that are not written as YYYY-MM-DD need an explicit `format`.
# Month names are matched in the current locale (English is assumed here).
# %b is the abbreviated month name ("Jan").
as.Date("Jan-10-2018", format = "%b-%d-%Y")
## [1] "2018-01-10"
# %B is the full month name ("January"). The input is a full name, so use
# the matching code instead of relying on %b's lenient input matching.
as.Date("January-10-2018", format = "%B-%d-%Y")
## [1] "2018-01-10"
# Numeric month/day with a two-digit year (%y).
dates <- c("02/27/92", "02/27/92", "01/14/92", "02/28/92", "02/01/92")
as.Date(dates, format = "%m/%d/%y")
## [1] "1992-02-27" "1992-02-27" "1992-01-14" "1992-02-28" "1992-02-01"
%Y = year in extended form : 2018
%y = year in short form : 18
%B = month in extended form : February
%b = month in short form : Feb
%m = month in numeric form : 2
%d = day of the month : 15
%j = day of the year : 46
%H = hour (24 hours) : 22
%I = hour (12 hours) : 10
%M = minutes
%S = seconds
To specify a time zone, use as.POSIXct() or as.POSIXlt().
# zaman1 ("time 1"): a date-time stored as text, with the format string that
# describes its layout (full month name, day, 4-digit year, 24-hour time).
zaman1 <- "May 5, 2018, 12:57:10"
zaman1.format <- "%B %d, %Y, %H:%M:%S"
# Parse into POSIXct, interpreting the clock time in the GMT time zone.
zaman1.ct <- as.POSIXct(zaman1, format=zaman1.format, tz="GMT")
zaman1.ct
## [1] "2018-05-05 12:57:10 GMT"
# POSIXct is stored as a single double.
class(zaman1.ct)
## [1] "POSIXct" "POSIXt"
typeof(zaman1.ct)
## [1] "double"
mode(zaman1.ct)
## [1] "numeric"
# POSIXlt holds the same instant as a list of components.
zaman1.lt<- as.POSIXlt(zaman1.ct)
zaman1.lt
## [1] "2018-05-05 12:57:10 GMT"
class(zaman1.lt)
## [1] "POSIXlt" "POSIXt"
typeof(zaman1.lt)
## [1] "list"
mode(zaman1.lt)
## [1] "list"
# Show the raw components. As the printed values show, several are offsets
# rather than calendar values: $mon is 0-based (4 = May), $year counts from
# 1900 (118 = 2018), $wday counts from Sunday = 0 (6 = Saturday) and $yday is
# 0-based (124 = the 125th day of the year).
unclass(zaman1.lt)
## $sec
## [1] 10
##
## $min
## [1] 57
##
## $hour
## [1] 12
##
## $mday
## [1] 5
##
## $mon
## [1] 4
##
## $year
## [1] 118
##
## $wday
## [1] 6
##
## $yday
## [1] 124
##
## $isdst
## [1] 0
##
## attr(,"tzone")
## [1] "GMT"
parse date: read date from text
# Six timestamps stored as text in "HH:MM:SS dd/mm/YYYY" layout.
zaman2 <- c(
  "15:10:00 20/03/2018", "16:00:35 19/01/2017", "10:20:30 05/12/2017",
  "12:15:15 30/06/2018", "11:20:35 21/04/2017", "15:50:00 11/09/2018"
)
# strptime() parses the text using the matching format codes and reads the
# clock times as UTC.
zaman2_str <- strptime(x = zaman2, format = "%H:%M:%S %d/%m/%Y", tz = "UTC")
zaman2_str
## [1] "2018-03-20 15:10:00 UTC" "2017-01-19 16:00:35 UTC"
## [3] "2017-12-05 10:20:30 UTC" "2018-06-30 12:15:15 UTC"
## [5] "2017-04-21 11:20:35 UTC" "2018-09-11 15:50:00 UTC"
# The parsed result is list-based, as mode() reports.
mode(zaman2_str)
## [1] "list"
For more date-time tools, see library(lubridate)
test
is.numeric(), is.character(), is.vector(), is.matrix(), is.data.frame(),
is.factor(), is.logical()
convert
as.character(), as.vector(), as.matrix(), as.data.frame(),
as.factor(), as.logical()
more with libraries
# Sort the cars by fuel economy, lowest mpg first. This replaces the working
# copy of mtcars with its sorted version.
mtcars <- mtcars[with(mtcars, order(mpg)), ]
head(mtcars)
# Sort by mpg, breaking ties with disp (both ascending).
mtcars1 <- mtcars[with(mtcars, order(mpg, disp)), ]
head(mtcars1)
# Sort by mpg ascending, breaking ties with qsec descending (the minus sign).
mtcars2 <- mtcars[with(mtcars, order(mpg, -qsec)), ]
head(mtcars2)
# Two small data frames whose key columns k1 and k2 contain NA values.
x <- data.frame(
  k1 = c(NA, NA, 3, 4, 5),
  k2 = c(1, NA, NA, 4, 5),
  data = seq_len(5)
)
y <- data.frame(
  k1 = c(NA, 2, NA, 4, 5),
  k2 = c(NA, NA, 3, 4, 5),
  data = seq_len(5)
)
# Column-bind the two frames side by side; column names are repeated.
cbind(x, y)
inner join
# Print both inputs (x and y are the small NA-containing frames built above).
x
y
# By default merge() treats NA keys as equal to each other.
merge(x, y, by = "k1") # NA's match, so 6 rows
# incomparables = NA: NA keys are never matched, leaving only k2 = 4 and 5.
merge(x, y, by = "k2", incomparables = NA) # 2 rows
# Merge on both key columns at once.
merge(x, y, by = c("k1","k2")) # NA's match
2.add row
rbind(x,y)
dataframe[row indices, column indices]
# Keep columns by position (cyl and hp in the stock mtcars column order).
a1 <- mtcars[c(2, 4)]
head(a1)
# Keep columns by name.
isim1 <- c("cyl", "hp", "qsec")
a2 <- mtcars[, isim1]
head(a2)
# Drop columns by negative position.
a3 <- mtcars[, -c(2, 4)]
head(a3)
# Drop columns by assigning NULL to them on a copy.
a4 <- mtcars
a4[c("wt", "carb")] <- NULL
head(a4)
# Keep the first three rows.
a5 <- head(mtcars, n = 3)
head(a5)
# Keep the rows that satisfy a condition on two columns.
a6 <- mtcars[with(mtcars, which(cyl == 6 & disp == 160)), ]
head(a6)
# sqldf runs SQL queries against data frames. The `##` lines below are the
# messages and warnings printed while the package loaded.
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Warning in system2("/usr/bin/otool", c("-L", shQuote(DSO)), stdout = TRUE):
## running command ''/usr/bin/otool' -L
## '/Library/Frameworks/R.framework/Resources/library/tcltk/libs//tcltk.so'' had
## status 1
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 4.2.3
# Filter and sort: cars with exactly one carburettor, ordered by mpg.
# row.names=TRUE asks sqldf to carry the data frame's row names through.
newdf <- sqldf("select * from mtcars where carb=1 order by mpg",
row.names=TRUE)
# Aggregate: mean mpg and mean disp per gear, restricted to 4- and 6-cylinder cars.
sqldf("select avg(mpg) as avg_mpg, avg(disp) as avg_disp, gear
from mtcars where cyl in (4, 6) group by gear")
# Load the two example tables from CSV files in the working directory.
crashes <- read.csv("crashes.csv")
roads <- read.csv("roads.csv")
head(crashes)
print(roads)
## Road District Length
## 1 Interstate 65 Greenfield 262
## 2 Interstate 70 Vincennes 156
## 3 US-36 Crawfordsville 139
## 4 US-40 Greenfield 150
## 5 US-52 Crawfordsville 172
# Left join: keep every row of crashes and attach District and Length from
# roads wherever the Road values match.
join_string <- "select crashes.* , roads.District, roads.Length
from crashes
left join roads
on crashes.Road = roads.Road"
join_string
## [1] "select crashes.* , roads.District, roads.Length\n from crashes\n left join roads\n on crashes.Road = roads.Road"
crashes_join_roads <- sqldf(join_string, stringsAsFactors = FALSE)
crashes_join_roads
# Inner join: keep only the crashes rows whose Road also appears in roads.
join_string2 <- "select crashes.* , roads.District, roads.Length
from crashes
inner join roads
on crashes.Road = roads.Road"
crashes_join_roads2<- sqldf(join_string2, stringsAsFactors = FALSE)
head(crashes_join_roads2)
The merge statement in base R can perform the equivalent of inner and left joins, as well as right and full outer joins, which SQLite (the default engine behind sqldf) only supports from version 3.39.0 onward.
# Inner join: only roads present in both tables.
crashes_merge_roads <- merge(x = crashes, y = roads, by = "Road")
crashes_merge_roads
# Left join: every row of crashes, with NA where roads has no match.
crashes_merge_roads2 <- merge(x = crashes, y = roads, by = "Road", all.x = TRUE)
crashes_merge_roads2
# Right join: every row of roads, with NA where crashes has no match.
crashes_merge_roads3 <- merge(x = crashes, y = roads, by = "Road", all.y = TRUE)
crashes_merge_roads3
# Full outer join: every row of both tables (all = TRUE sets both all.x and
# all.y).
crashes_merge_roads4 <- merge(x = crashes, y = roads, by = "Road", all = TRUE)
crashes_merge_roads4
Modifying the inner join query to include a where is the equivalent of combining merge and subset statements.
# Inner join restricted to a single road with a where clause.
# NOTE: this reuses the name join_string2, overwriting the inner-join query
# defined earlier in the file.
join_string2 <- "select crashes.* , roads.District, roads.Length
from crashes
inner join roads
on crashes.Road = roads.Road
where crashes.Road = 'US-40'"
crashes_join_roads4 <- sqldf(join_string2,stringsAsFactors = FALSE)
crashes_join_roads4
Aggregate functions available using SQLite can be used through the use of a group by clause.
# Mean number of crashes per road. `group by 1` groups by the first column of
# the select list, i.e. crashes.Road.
# NOTE(review): no roads column is selected, so the left join looks
# unnecessary for this result; confirm before removing it.
group_string <- "select crashes.Road, avg(crashes.N_Crashes) as Mean_Crashes
from crashes
left join roads
on crashes.Road = roads.Road
group by 1"
sqldf(group_string)
While sqldf can make certain data manipulation operations easier, more advanced data manipulation tasks and calculations must be performed in R, such as using Hadley Wickham’s plyr package.
# Split crashes_merge_roads by Road, compute the mean, quartiles and median
# of N_Crashes for each piece, and combine the one-row results.
plyr::ddply(crashes_merge_roads,
c("Road"),
function(X) data.frame(Mean_Crashes = mean(X$N_Crashes),
Q1_Crashes = quantile(X$N_Crashes, 0.25),
Q3_Crashes = quantile(X$N_Crashes, 0.75),
Median_Crashes = quantile(X$N_Crashes, 0.50))
)
# Two example strings used throughout the printing examples.
c1 <- "Mustafa Akgül"
c2 <- "Özgür Yazılım Kış Kampı"
# paste() joins its arguments with a single space by default.
paste(c1, c2)
## [1] "Mustafa Akgül Özgür Yazılım Kış Kampı"
# Non-character arguments are converted to text first.
paste("The life of", pi)
## [1] "The life of 3.14159265358979"
paste("I", "love", "R")
## [1] "I love R"
# `sep` changes the separator placed between the pieces.
paste("I", "love", "R", sep = "-")
## [1] "I-love-R"
# paste0() joins with no separator at all.
paste0("I", "love", "R")
## [1] "IloveR"
# paste() is vectorised: the scalar "R" is recycled against 1:5.
paste("R", 1:5, sep = " v1.")
## [1] "R v1.1" "R v1.2" "R v1.3" "R v1.4" "R v1.5"
# Pitfall: paste0() has no `sep` argument, so " v1." is treated as one more
# piece to concatenate and ends up at the END of each result.
paste0("R", 1:5, sep = " v1.")
## [1] "R1 v1." "R2 v1." "R3 v1." "R4 v1." "R5 v1."
# Numeric to character: c3 first holds the number pi, then its text form.
c3<-pi
c3<-as.character(pi)
c3
## [1] "3.14159265358979"
# toString() collapses a vector into one comma-separated string.
toString(c("Aug", 24, 1980))
## [1] "Aug, 24, 1980"
print(): generic printing
noquote(): print with no quotes
cat(): concatenate and print with no quotes
sprintf(): a wrapper for the C function sprintf, that returns a character vector containing a formatted combination of text and variable values
print(c2)
## [1] "Özgür Yazılım Kış Kampı"
print(c2, quote = FALSE)
## [1] Özgür Yazılım Kış Kampı
noquote(c2)
## [1] Özgür Yazılım Kış Kampı
Another very useful function is cat() which allows us to
concatenate objects and print them either on screen or to a file. The
output result is very similar to noquote(); however,
cat() does not print the numeric line indicator. As a
result, cat() can be useful for printing nicely formatted
responses to users.
noquote(c2)
## [1] Özgür Yazılım Kış Kampı
cat(c2)
## Özgür Yazılım Kış Kampı
cat(c2, "2020")
## Özgür Yazılım Kış Kampı 2020
cat(letters)
## a b c d e f g h i j k l m n o p q r s t u v w x y z
cat(letters, sep = "-")
## a-b-c-d-e-f-g-h-i-j-k-l-m-n-o-p-q-r-s-t-u-v-w-x-y-z
cat(letters, sep = "")
## abcdefghijklmnopqrstuvwxyz
long strings
cat(c1,c2, fill=1)
## Mustafa Akgül
## Özgür Yazılım Kış Kampı
sprintf() is a useful printing function for precise
control of the output. It is a wrapper for the C function sprintf and
returns a character vector containing a formatted combination of text
and variable values.To substitute in a string or string variable, use
%s:
# c2 is the example string defined earlier in the file.
c2
## [1] "Özgür Yazılım Kış Kampı"
# %s is replaced by the string argument.
sprintf("Mustafa Akgül %s 2020", c2)
## [1] "Mustafa Akgül Özgür Yazılım Kış Kampı 2020"
# Several %s placeholders are filled in argument order.
sprintf("Hoşgeldin %s %s 2020",c1,c2)
## [1] "Hoşgeldin Mustafa Akgül Özgür Yazılım Kış Kampı 2020"
# %d formats an integer value; the whole number 3 is accepted here.
r<-3
sprintf("Kullandığım R versiyonu:%d",r)
## [1] "Kullandığım R versiyonu:3"
print with leading spaces
r<-3
# %4d right-aligns the integer in a field 4 characters wide, padded with
# spaces on the left.
sprintf("Kullandığım R versiyonu:%4d",r)
## [1] "Kullandığım R versiyonu:   3"
can also lead with zeros
r<-3
sprintf("Kullandığım R versiyonu:%04d",r)
## [1] "Kullandığım R versiyonu:0003"
For floating-point numbers, use %f for standard notation, and %e or %E for exponential notation:
‘%f’ indicates ‘fixed point’ decimal notation
sprintf("%f", pi)
## [1] "3.141593"
decimal notation with 3 decimal digits
sprintf("%.3f", pi)
## [1] "3.142"
1 integer and 0 decimal digits
sprintf("%1.0f", pi)
## [1] "3"
decimal notation with a total field width of 5 characters and only 1 digit to the right of the decimal point
# "3.1" is 3 characters, so two leading spaces pad it to the width of 5.
sprintf("%5.1f", pi)
## [1] "  3.1"
fill empty digits with zeros
sprintf("%05.1f", pi)
## [1] "003.1"
print with sign (positive)
sprintf("%+f", pi)
## [1] "+3.141593"
prefix a space
sprintf("% f", pi)
## [1] " 3.141593"
exponential decimal notation ‘e’
sprintf("%e", pi)
## [1] "3.141593e+00"
exponential decimal notation ‘E’
sprintf("%E", pi)
## [1] "3.141593E+00"
length("Bir berber bir berbere gel beraber bir berber dükkanı açalım demiş")
## [1] 1
length(c("Bir","berber","bir","berbere","gel","beraber","bir","berber", "dükkanı","açalım","demiş"))
## [1] 11
nchar("Bir berber bir berbere gel beraber bir berber dükkanı açalım demiş")
## [1] 66
nchar(c("Bir","berber","bir","berbere","gel","beraber","bir","berber", "dükkanı","açalım","demiş"))
## [1] 3 6 3 7 3 7 3 6 7 6 5
# String manipulation with base R
To replace a character/s
x <- "This is A string."
chartr(old = "A", new = "a", x)
## [1] "This is a string."
replace any ‘d’ with ‘t’ and any ‘z’ with ‘a’
y <- "Tomorrow I plzn do lezrn zbout dexduzl znzlysis."
chartr(old = "dz", new = "ta", y)
## [1] "Tomorrow I plan to learn about textual analysis."
Note that chartr() replaces every identified letter for replacement so the only time I use it is when I am certain that I want to change every possible occurrence of a letter.
String Abbreviations
streets <- c("Main", "Elm", "Riverbend", "Mario", "Frederick")
# default abbreviations
abbreviate(streets)
## Main Elm Riverbend Mario Frederick
## "Main" "Elm" "Rvrb" "Mari" "Frdr"
# set minimum length of abbreviation
abbreviate(streets, minlength = 2)
## Main Elm Riverbend Mario Frederick
## "Mn" "El" "Rv" "Mr" "Fr"
Extract/Replace Substrings
To extract or replace substrings in a character vector there are three primary base R functions to use: substr(), substring(), and strsplit(). The purpose of substr() is to extract and replace substrings with specified starting and stopping characters:
# Build the 26-letter upper-case alphabet as a single string.
alphabet <- paste0(LETTERS, collapse = "")
# Pull out just the 18th character.
substr(alphabet, 18, 18)
## [1] "R"
# Pull out characters 18 through 24.
substr(alphabet, 18, 24)
## [1] "RSTUVWX"
# Overwrite characters 19 through 24 in place with `R`.
substr(alphabet, 19, 24) <- "RRRRRR"
alphabet
## [1] "ABCDEFGHIJKLMNOPQRRRRRRRYZ"
The purpose of substring() is to extract and replace substrings with only a specified starting point. substring() also allows you to extract/replace in a recursive fashion:
alphabet <- paste(LETTERS, collapse = "")
# extract 18th through last character
substring(alphabet, first = 18)
## [1] "RSTUVWXYZ"
recursive extraction; specify start position only
substring(alphabet, first = 18:24)
## [1] "RSTUVWXYZ" "STUVWXYZ" "TUVWXYZ" "UVWXYZ" "VWXYZ" "WXYZ"
## [7] "XYZ"
recursive extraction; specify start and stop positions
substring(alphabet, first = 1:5, last = 3:7)
## [1] "ABC" "BCD" "CDE" "DEF" "EFG"
To split the elements of a character string use strsplit():
# Split a sentence into words at each space. strsplit() returns a list with
# one character vector per input string, hence the [[1]] in the output.
z <- "The day after I will take a break and drink a beer."
strsplit(z, split = " ")
## [[1]]
## [1] "The" "day" "after" "I" "will" "take" "a" "break" "and"
## [10] "drink" "a" "beer."
# Split on a different delimiter.
a <- "Alabama-Alaska-Arizona-Arkansas-California"
strsplit(a, split = "-")
## [[1]]
## [1] "Alabama" "Alaska" "Arizona" "Arkansas" "California"
# unlist() flattens the one-element list into a plain character vector.
unlist(strsplit(a, split = "-"))
## [1] "Alabama" "Alaska" "Arizona" "Arkansas" "California"
There are three stringr functions that are closely related to their base R equivalents, but with a few enhancements:
str_c() is equivalent to the paste() functions:
same as paste0()
library(stringr)
## Warning: package 'stringr' was built under R version 4.2.3
str_c("Learning", "to", "use", "the", "stringr", "package")
## [1] "Learningtousethestringrpackage"
same as paste()
str_c("Learning", "to", "use", "the", "stringr", "package", sep = " ")
## [1] "Learning to use the stringr package"
allows recycling
str_c(letters, " is for", "...")
## [1] "a is for..." "b is for..." "c is for..." "d is for..." "e is for..."
## [6] "f is for..." "g is for..." "h is for..." "i is for..." "j is for..."
## [11] "k is for..." "l is for..." "m is for..." "n is for..." "o is for..."
## [16] "p is for..." "q is for..." "r is for..." "s is for..." "t is for..."
## [21] "u is for..." "v is for..." "w is for..." "x is for..." "y is for..."
## [26] "z is for..."
str_length() is similar to the nchar() function; however, str_length() behaves more appropriately with missing (‘NA’) values:
some text with NA
# Example character vector containing two missing values; assigned with `<-`
# like every other assignment in this file.
text <- c("Learning", "to", NA, "use", "the", NA, "stringr", "package")
compare str_length() with nchar()
nchar(text)
## [1] 8 2 NA 3 3 NA 7 7
str_length(text)
## [1] 8 2 NA 3 3 NA 7 7
str_sub() is similar to substr(); however, it returns a zero length vector if any of its inputs are zero length, and otherwise expands each argument to match the longest. It also accepts negative positions, which are calculated from the left of the last character.
x <- "Learning to use the stringr package"
alternative indexing
str_sub(x, start = 1, end = 15)
## [1] "Learning to use"
str_sub(x, end = 15)
## [1] "Learning to use"
str_sub(x, start = 17)
## [1] "the stringr package"
str_sub(x, start = c(1, 17), end = c(15, 35))
## [1] "Learning to use" "the stringr package"
using negative indices for start/end points from end of string
str_sub(x, start = -1)
## [1] "e"
str_sub(x, start = -19)
## [1] "the stringr package"
str_sub(x, end = -21)
## [1] "Learning to use"
Replacement
str_sub(x, end = 15) <- "I know how to use"
x
## [1] "I know how to use the stringr package"
str_dup("beer", times = 3)
## [1] "beerbeerbeer"
str_dup("beer", times = 1:3)
## [1] "beer" "beerbeer" "beerbeerbeer"
use with a vector of strings
states_i_luv <- state.name[c(6, 23, 34, 35)]
str_dup(states_i_luv, times = 2)
## [1] "ColoradoColorado" "MinnesotaMinnesota"
## [3] "North DakotaNorth Dakota" "OhioOhio"
Remove Leading and Trailing Whitespace
text <- c("Text ", " with", " whitespace ", " on", "both ", " sides ")
remove whitespaces on the left side
str_trim(text, side = "left")
## [1] "Text " "with" "whitespace " "on" "both "
## [6] "sides "
remove whitespaces on the right side
str_trim(text, side = "right")
## [1] "Text" " with" " whitespace" " on" "both"
## [6] " sides"
remove whitespaces on both sides
str_trim(text, side = "both")
## [1] "Text" "with" "whitespace" "on" "both"
## [6] "sides"
To add whitespace, or to pad a string, use str_pad(). You can also use str_pad() to pad a string with specified characters.
# Pad "beer" (4 characters) to a total width of 10 with spaces on the left.
str_pad("beer", width = 10, side = "left")
## [1] "      beer"
# Pad on both sides: the 6 padding spaces are split between left and right.
str_pad("beer", width = 10, side = "both")
## [1] "   beer   "
# Pad on the right with a custom character instead of a space.
str_pad("beer", width = 10, side = "right", pad = "!")
## [1] "beer!!!!!!"
Set Union
set_1 <- c("lagunitas", "bells", "dogfish", "summit", "odell")
set_2 <- c("sierra", "bells", "harpoon", "lagunitas", "founders")
union(set_1, set_2)
## [1] "lagunitas" "bells" "dogfish" "summit" "odell" "sierra"
## [7] "harpoon" "founders"
Set Intersection To obtain the common elements of two character vectors use intersect():
intersect(set_1, set_2)
## [1] "lagunitas" "bells"
To obtain the non-common elements, or the difference, of two character vectors use setdiff():
returns elements in set_1 not in set_2
setdiff(set_1, set_2)
## [1] "dogfish" "summit" "odell"
returns elements in set_2 not in set_1
setdiff(set_2, set_1)
## [1] "sierra" "harpoon" "founders"
To test if two vectors contain the same elements regardless of order use setequal():
set_3 <- c("woody", "buzz", "rex")
set_4 <- c("woody", "andy", "buzz")
set_5 <- c("andy", "buzz", "woody")
setequal(set_3, set_4)
## [1] FALSE
setequal(set_4, set_5)
## [1] TRUE
To test if two character vectors are equal in content and order use identical():
set_6 <- c("woody", "andy", "buzz")
set_7 <- c("andy", "buzz", "woody")
set_8 <- c("woody", "andy", "buzz")
identical(set_6, set_7)
## [1] FALSE
identical(set_6, set_8)
## [1] TRUE
Identifying If Elements Are Contained in a String To test if an element is contained within a character vector use is.element() or %in%:
good <- "andy"
bad <- "sid"
is.element(good, set_8)
## [1] TRUE
good %in% set_8
## [1] TRUE
bad %in% set_8
## [1] FALSE
sort(set_8)
## [1] "andy" "buzz" "woody"
sort(set_8, decreasing = TRUE)
## [1] "woody" "buzz" "andy"
A regular expression (aka regex) is a sequence of characters that define a search pattern, mainly for use in pattern matching with text strings.
help(regex)